#    topic 16
#  This will be remarkably similar to topic 15, so
#  much so that it would be worth comparing the two scripts.

#  First, set up the situation.  We have a population
#  with an unknown standard deviation.
#
source("../gnrnd5.R")
source("../gnrnd4.R")
gnrnd5( 179453199903, 873001418)
big_pop <- L1
#  I do not know the standard deviation of big_pop

#  Now, someone says that they believe that the 
#  mean of big_pop is 200.  Thus our null hypothesis
#  is that the true mean is 200. We want to see if that
#  could be correct.  We will get a sample of 
#  size 43 and we will look at the sample mean.
#  We are willing to be wrong in telling the
#  person that they are wrong 2.5% of the time!

########################
##  The critical value approach. We know that
##  where the standard deviation of big_pop is sigma then
##  samples of size 43 will have a standard error
##  of the mean be sigma/sqrt(43) and that those
##  values will be normally distributed with the 
##  same mean as the population, a value we assume
##  to be 200. But we do not know the value of sigma.
##  So we can approximate the distribution of the
##  mean of samples of size 43 with a Student's-t
##  with 42 degrees of freedom.

##  This is a two-tailed test because
##  a sample mean that is either too low or too high
##  would indicate that 200 is not the mean.  
##  Therefore, find the value, in a Student's-t with
##  42 degrees for freedom that has 0.025/2 as the
##  P(X<t)  and the value that
##  has 0.025/2 as the P(X>t), where we use the sample 
##  standard deviation rather than the population
##  standard deviation.  Because of this we need to 
##  take the sample first in order to find the sample
##  standard deviation.

#  Get our sample
#  the first time we do this let us get the
#  same sample each time
gnrnd4(768734201, 200000001)
L1
# take those as the index values of our random sample
our_samp <- big_pop[ L1 ]
our_samp
#  find the mean of our sample
x_bar <- mean( our_samp )
x_bar
#  find the standard deviation of the sample
s_x <- sd( our_samp )
s_x

#  Now we can get our critical values
#  long way
low_t <- qt(0.025/2,42)
low_t
low_val <- 200 + low_t*s_x/sqrt(43) # + because t is < 0
low_val
high_t <- qt( 0.025/2,42, lower.tail=FALSE)
high_t  # this was silly because we know it is -low_t
high_val <- 200 + high_t*s_x/sqrt(43)
high_val



## so now compare the mean of the sample 
##  to our critical values.
x_bar
##    In this case the sample mean is greater than
##    our critical high.  Therefore reject, at the 
##    0.025 level of significance, the 
##    null hypothesis that the true mean is 200
##    in favor of the alternative hypothesis that
##    the true mean is not 200.

#############
#############  the attained significance approach
##   how strange would it be to get a mean of 
##  207.0581 for a sample of size 43 if the true
##  mean is 200?  Because we do not have the
##  flexibility in pt() that we have in pnorm()
##  we will have to normalize our value.
#
pt( (207.0581 - 200 )/(s_x/sqrt(43)), 42, 
    lower.tail=FALSE)
#
## but we would need to double that to account for
## values that extreme or more extreme on the low 
## side
0.005047642*2
#  is that probability less than our 2.5% ?
#  Yes, therefore, reject the null hypothesis
#  in favor of the alternative.

#######################   Now use the function to
#######################   do the same thing
source("../hypo_unknown.R")
hypoth_test_unknown( 200,0, 0.025,
                   43, x_bar, s_x )

#######################################
#######################################
#   Now we want to repeat this process
#   but each time we want a different sample
#   of size 35
L1 <- sample( big_pop, 43 )
x_bar <- mean( L1 )
s_x <- sd( L1 )
hypoth_test_unknown( 200,  0, 0.025,
                   43, x_bar, s_x )
#### perform lines 114-118 again and again

###  now, since we have the population let us peek
#    at the true mean
mean( big_pop )
#######   Try our samples again, but this time test
##        the null hypothesis that the true mean
##        is 207.0618, and do the test at the 0.05 
##        level of significance.  
L1 <- sample( big_pop, 43 )
x_bar <- mean( L1 )
s_x <- sd( L1 )
hypoth_test_unknown( 207.0618,  0, 0.05,
                     43, x_bar, s_x )
#### perform lines 128-132 again and again,
#### and we should see a Type I error about 
#### 5% of the time.

###  we can actually do this 1000 times and see how 
### times we reject  the null hypothesis even 
###  though it is true.
L2 <- 1:1000
for( i in 1:1000) {
  L1 <- sample( big_pop, 43 )
  x_bar <- mean( L1 )
  s_x <- sd( L1 )
  answer <- hypoth_test_unknown( 207.0618,  0, 0.05,
                       43, x_bar, s_x )  
  L2[i] <- answer[14]
}
table( L2 )